###############################################################################################
## Carl Schmertmann
## 6 Nov 2013
##
## This script is intended to be run inside of other R programs, using source( )
## It reads and arranges input data, as follows
##  (1) reads the HFD rate and exposure data 
##  (2) appends similarly formatted non-HFD data from Australia, Belgium, Denmark,
##      Greece, Iceland, Italy, Japan, Korea, Luxembourg, New Zealand, Romania, and Singapore
##  (3) appends differently-formated data from Brazil
##
## Results that are ready for use by other programs after this script is run include:
##  (A) a series of lists, one per country, with ASFR and related data (as described just below)
##  (B) SVD components for schedule shapes [X]
##  (C) residual projection matrix [M]
##  (D) empirical covariance estimate for shape residuals [omega]
##  (E) empirical inverse covariance estimate for shape residuals [omega.plus]
###############################################################################################

## because each country has data over different calendar years and cohorts, 
## data are collected in lists:

rates    = list()   # age x cohort ASFR matrices for each country
crates   = list()   # age x cohort ASFR matrices for each country
cohort   = list()   # numeric vectors containing cohorts included in each country's data (e.g [1917,1918,...1995])
sched    = list()   # a characther version of cohort, with country names appended (mostly for pretty output)
complete = list()   # logical vector indicating which cohort schedules for this country are complete
expos    = list()   # age x cohort exposure matrices for each country
origin   = list()   # where did each country's data come from -- HFD, Myrsklya et al., Lima

age = 15:44         # which ages do we need for a 'complete' schedule?
A   = length(age)   # an important constant

################################### 
#  READ HFD DATA  
###################################

FF  <-  read.table("asfrVV.txt",skip=2,as.is=T, header=T)     # HFD rate estimates (cohort x age)
WW  <-  read.table("exposVV.txt",skip=2,as.is=T, header=T)    # HFD exposure data (cohort x age)

for (k in unique(FF$Code)) {
  
  ## selected subset for rates data
  sel = ( (FF$Code==k) & (FF$ARDY %in% age) )
  
  origin[[k]]    = "HFD download 2 Nov 2011"
  rates[[k]]     = tapply( FF$ASFR[sel], list( FF$ARDY[sel], FF$Cohort[sel]), sum) 
  
  complete[[k]]  = apply( rates[[k]], 2, function(x) !any(is.na(x)) )
  crates[[k]]    = apply( rates[[k]], 2, cumsum)
  cohort[[k]]    = as.numeric(dimnames(rates[[k]])[[2]])
  sched[[k]]     = paste(k, cohort[[k]], sep="")
  
  ## (possibly) different selected subset for exposure data
  sel            = ( (WW$Code==k) & (WW$ARDY %in% age) )
  expos[[k]]     = tapply( WW$Exposure[sel], list( WW$ARDY[sel], WW$Cohort[sel]), sum)  
  
} ## for k

################################################## 
#  READ additional HFD-style DATA 
#    for 12 more countries Belgium...Singapore
##################################################

FF <- read.csv("additional.data.asfr.csv", as.is=T, header=T)
WW <- read.csv("additional.data.expos.csv", as.is=T, header=T)

for (k in unique(FF$Code)) {
  
  ## selected subset for rates data
  sel = ( (FF$Code==k) & (FF$ARDY %in% age) )
  
  origin[[k]]    = "Myrskyla et al. Jan 2013, as in PDR 39(1)"
  rates[[k]]     = tapply( FF$ASFR[sel], list( FF$ARDY[sel], FF$Cohort[sel]), sum) 
  
  complete[[k]]  = apply( rates[[k]], 2, function(x) !any(is.na(x)) )
  crates[[k]]    = apply( rates[[k]], 2, cumsum)
  cohort[[k]]    = as.numeric(dimnames(rates[[k]])[[2]])
  sched[[k]]     = paste(k, cohort[[k]], sep="")
  
  ## (possibly) different selected subset for exposure data
  sel            = ( (WW$Code==k) & (WW$ARDY %in% age) )
  expos[[k]]     = tapply( WW$Exposure[sel], list( WW$ARDY[sel], WW$Cohort[sel]), sum)
  
} ## for k

################################################## 
#  READ BRAZILIAN DATA (formatted differently, 
#     without exposure data)
##################################################

B = read.csv("Brazil 1966-2010 ASFR.csv", as.is=T, skip=4, row.names=1)

brazil.ages  = 15:49
brazil.years = 1966:2010

brazil.cohorts = outer(-brazil.ages, brazil.years,"+" )
clist          = sort( unique( as.vector(brazil.cohorts)))
ncohort        = length(clist)

Brazil = matrix(NA, length(brazil.ages), ncohort, 
                dimnames=list(brazil.ages,clist))

AGE = matrix( brazil.ages[row(B)], ncol=ncol(B))

for (coh in clist) {
  # any non-missing rates
  ff =  as.matrix(B)[ brazil.cohorts==coh]
  aa =  AGE[ brazil.cohorts==coh]
  
  Brazil[paste(aa), paste(coh)] = ff
}

k = "Brazil"

origin[[k]]    = "personal communication from Everton Lima, Fall 2012"
rates[[k]]     = Brazil[paste(age),]   # limit to 15-44
complete[[k]]  = apply( rates$Brazil, 2, function(x) !any(is.na(x)) )
crates[[k]]    = apply( rates[[k]], 2, cumsum)
cohort[[k]]    = as.numeric(dimnames(rates[[k]])[[2]])
sched[[k]]     = paste(k, cohort[[k]], sep="")
expos[[k]]     = matrix( 500000, 30, length(cohort[[k]]), dimnames=list(age,cohort[[k]]))  # an arbitrary approx

###############################################################

full.name = c(
  AUT         = "Austria",
  BGR         = "Bulgaria",
  CAN         = "Canada",
  CZE         = "Czech Rep",
  EST         = "Estonia",
  FIN         = "Finland",
  FRATNP      = "France",
  DEUTNP      = "Germany",
  DEUTW       = "Western Germany",
  DEUTE       = "Eastern Germany",
  HUN         = "Hungary",
  LTU         = "Lithuania",
  NLD         = "Netherlands",
  PRT         = "Portugal",
  RUS         = "Russia",
  SVK         = "Slovakia",        
  SVN         = "Slovenia",
  SWE         = "Sweden",
  CHE         = "Switzerland",
  GBR_NP      = "Great Britain",
  GBRTENW     = "England and Wales",
  GBR_SCO     = "Scotland",
  GBR_NIR     = "Northern Ireland",
  USA         = "USA",   
  Australia   = "Australia",
  Belgium     = "Belgium",
  Denmark     = "Denmark",
  Greece      = "Greece",
  Iceland     = "Iceland",
  Italy       = "Italy",
  Japan       = "Japan",
  Korea       = "Korea",
  Luxembourg  = "Luxembourg",
  New_Zealand = "New Zealand",
  Romania     = "Romania",
  Singapore   = "Singapore",
  Brazil      = "Brazil"
)  
 
## calculate 
##     SVD components for schedule shapes [X]
##     residual projection matrix [M]
##     empirical covariance estimate for shape residuals [omega]
##     empirical inverse covariance estimate for shape residuals [omega.plus]

### CALIBRATE SHAPE PENALTY using data from cohorts born before 1950 ##########

## assemble the historical set of cohorts born 1900-1949 that have complete schedules
PHI = NULL

for (k in names(rates)) {
  keep = complete[[k]] & (cohort[[k]] %in% 1900:1949)
  tmp  = rates[[k]][, keep ]
  PHI  = cbind(PHI, tmp)
}

## Singular value decomposition PHI = U D V'
ss = svd(PHI)

## keep the first 3 components, with signs flipped to make basis functions prettier
X = ss$u[,1:3] %*% diag(c(-1,+1,-1))   

## orthogonal projection matrix for X components
M = diag(A) - X %*% solve( t(X) %*% X) %*% t(X)

## orthogonal residuals 
eps = M %*% PHI

## average outer product (=estimated covariance of shape residuals in the historical data)
omega      = eps %*% t(eps) / ncol(eps)

## slight cleanup -- the last several eigenvalues of omega should be zero, but because of 
##  precision errors they may be extremely small negative numbers.  Because of this,
##   calculate the Moore-Penrose inverse in a way that ensures zero eigenvalues.

nullity = qr(X)$rank
r       = ncol(omega) - nullity

ei = eigen(omega)
Ur = ei$vec[,1:r]        # cols of Ur are eigenvectors corr. to + eigenvalues
Dr = diag( ei$val[1:r])

## generalized inverse of omega
omega.plus = Ur %*% solve(Dr) %*% t(Ur)